Spatial data overview

First, let’s understand how different types of spatial data are handled in R, beginning with vector data (i.e. points, lines and polygons) and ending with a brief raster example.

Polygon data

We will begin by looking at polygon data. Let’s look at Philadelphia census tracts as an example. Census shapefiles are free to download on the Census website, but we will load a version that has already been converted to an sf object.

library(sf)

# Load Philadelphia census tracts (pre-merged and saved as an sf object in .rds format)
pt_sf <- readRDS("data/philadelphia_tracts.rds")

# Note pt_sf is an sf ("simple feature") object of type "MULTIPOLYGON"
class(pt_sf) 
## [1] "sf"         "data.frame"
# sf objects can be handled like data frames using standard commands
str(pt_sf)   # view structure: attribute columns plus a geometry list-column
## Classes 'sf' and 'data.frame':   384 obs. of  15 variables:
##  $ OBJECTID  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ STATEFP10 : Factor w/ 1 level "42": 1 1 1 1 1 1 1 1 1 1 ...
##  $ COUNTYFP10: Factor w/ 1 level "101": 1 1 1 1 1 1 1 1 1 1 ...
##  $ TRACTCE10 : Factor w/ 384 levels "000100","000200",..: 95 96 97 134 135 136 137 138 139 140 ...
##  $ GEOID10   : Factor w/ 384 levels "42101000100",..: 95 96 97 134 135 136 137 138 139 140 ...
##  $ NAME10    : Factor w/ 384 levels "1","10.01","10.02",..: 369 370 371 43 44 46 47 48 49 50 ...
##  $ NAMELSAD10: Factor w/ 384 levels "Census Tract 1",..: 369 370 371 43 44 46 47 48 49 50 ...
##  $ MTFCC10   : Factor w/ 1 level "G5020": 1 1 1 1 1 1 1 1 1 1 ...
##  $ FUNCSTAT10: Factor w/ 1 level "S": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ALAND10   : int  366717 319070 405273 341256 562934 439802 562132 789935 570015 609439 ...
##  $ AWATER10  : int  0 0 0 0 0 0 0 277434 282808 0 ...
##  $ INTPTLAT10: Factor w/ 384 levels "+39.8798897",..: 106 114 113 141 137 132 126 110 120 131 ...
##  $ INTPTLON10: Factor w/ 384 levels "-074.9667387",..: 350 362 370 257 234 208 168 129 113 135 ...
##  $ LOGRECNO  : Factor w/ 384 levels "10335","10336",..: 95 96 97 134 135 136 137 138 139 140 ...
##  $ geometry  :sfc_MULTIPOLYGON of length 384; first list element: List of 1
##   ..$ :List of 1
##   .. ..$ : num [1:36, 1:2] -75.2 -75.2 -75.2 -75.2 -75.2 ...
##   ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
##  - attr(*, "sf_column")= chr "geometry"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "names")= chr [1:14] "OBJECTID" "STATEFP10" "COUNTYFP10" "TRACTCE10" ...
head(pt_sf)  # view first several rows
## Simple feature collection with 6 features and 14 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -75.24747 ymin: 39.96047 xmax: -75.15853 ymax: 39.98
## Geodetic CRS:  WGS 84
##   OBJECTID STATEFP10 COUNTYFP10 TRACTCE10     GEOID10 NAME10       NAMELSAD10
## 0        1        42        101    009400 42101009400     94  Census Tract 94
## 1        2        42        101    009500 42101009500     95  Census Tract 95
## 2        3        42        101    009600 42101009600     96  Census Tract 96
## 3        4        42        101    013800 42101013800    138 Census Tract 138
## 4        5        42        101    013900 42101013900    139 Census Tract 139
## 5        6        42        101    014000 42101014000    140 Census Tract 140
##   MTFCC10 FUNCSTAT10 ALAND10 AWATER10  INTPTLAT10   INTPTLON10 LOGRECNO
## 0   G5020          S  366717        0 +39.9632709 -075.2322437    10429
## 1   G5020          S  319070        0 +39.9658709 -075.2379140    10430
## 2   G5020          S  405273        0 +39.9655396 -075.2435075    10431
## 3   G5020          S  341256        0 +39.9764504 -075.1771771    10468
## 4   G5020          S  562934        0 +39.9750563 -075.1711846    10469
## 5   G5020          S  439802        0 +39.9735358 -075.1630966    10470
##                         geometry
## 0 MULTIPOLYGON (((-75.22927 3...
## 1 MULTIPOLYGON (((-75.23536 3...
## 2 MULTIPOLYGON (((-75.24343 3...
## 3 MULTIPOLYGON (((-75.17341 3...
## 4 MULTIPOLYGON (((-75.17313 3...
## 5 MULTIPOLYGON (((-75.16141 3...
dim(pt_sf)   # view dimensions
## [1] 384  15
pt_sf[1,]    # select first row
## Simple feature collection with 1 feature and 14 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -75.23681 ymin: 39.96047 xmax: -75.22768 ymax: 39.96609
## Geodetic CRS:  WGS 84
##   OBJECTID STATEFP10 COUNTYFP10 TRACTCE10     GEOID10 NAME10      NAMELSAD10
## 0        1        42        101    009400 42101009400     94 Census Tract 94
##   MTFCC10 FUNCSTAT10 ALAND10 AWATER10  INTPTLAT10   INTPTLON10 LOGRECNO
## 0   G5020          S  366717        0 +39.9632709 -075.2322437    10429
##                         geometry
## 0 MULTIPOLYGON (((-75.22927 3...
head(pt_sf$NAMELSAD10)  # select column by name  
## [1] Census Tract 94  Census Tract 95  Census Tract 96  Census Tract 138
## [5] Census Tract 139 Census Tract 140
## 384 Levels: Census Tract 1 Census Tract 10.01 ... Census Tract 9891
head(pt_sf[,7])         # select column by number
## Simple feature collection with 6 features and 1 field
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -75.24747 ymin: 39.96047 xmax: -75.15853 ymax: 39.98
## Geodetic CRS:  WGS 84
##         NAMELSAD10                       geometry
## 0  Census Tract 94 MULTIPOLYGON (((-75.22927 3...
## 1  Census Tract 95 MULTIPOLYGON (((-75.23536 3...
## 2  Census Tract 96 MULTIPOLYGON (((-75.24343 3...
## 3 Census Tract 138 MULTIPOLYGON (((-75.17341 3...
## 4 Census Tract 139 MULTIPOLYGON (((-75.17313 3...
## 5 Census Tract 140 MULTIPOLYGON (((-75.16141 3...
# We can extract the geometry of pt_sf with the st_geometry function
pt_geo <- st_geometry(pt_sf)
pt_geo  # an sfc (simple feature geometry column) holding 384 MULTIPOLYGONs
## Geometry set for 384 features 
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -75.28031 ymin: 39.86747 xmax: -74.95575 ymax: 40.13793
## Geodetic CRS:  WGS 84
## First 5 geometries:
pt_geo[[1]]       # perimeter coordinates (an sfg matrix) for the first census tract of the sf
pt_sf[1,]  # i.e. Census Tract 94
## Simple feature collection with 1 feature and 14 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -75.23681 ymin: 39.96047 xmax: -75.22768 ymax: 39.96609
## Geodetic CRS:  WGS 84
##   OBJECTID STATEFP10 COUNTYFP10 TRACTCE10     GEOID10 NAME10      NAMELSAD10
## 0        1        42        101    009400 42101009400     94 Census Tract 94
##   MTFCC10 FUNCSTAT10 ALAND10 AWATER10  INTPTLAT10   INTPTLON10 LOGRECNO
## 0   G5020          S  366717        0 +39.9632709 -075.2322437    10429
##                         geometry
## 0 MULTIPOLYGON (((-75.22927 3...
pt_geo[[2]]        # perimeter coordinates (an sfg matrix) for the second census tract of the sf
pt_sf[2,]  # i.e. Census Tract 95
## Simple feature collection with 1 feature and 14 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -75.24076 ymin: 39.96148 xmax: -75.23535 ymax: 39.97035
## Geodetic CRS:  WGS 84
##   OBJECTID STATEFP10 COUNTYFP10 TRACTCE10     GEOID10 NAME10      NAMELSAD10
## 1        2        42        101    009500 42101009500     95 Census Tract 95
##   MTFCC10 FUNCSTAT10 ALAND10 AWATER10  INTPTLAT10   INTPTLON10 LOGRECNO
## 1   G5020          S  319070        0 +39.9658709 -075.2379140    10430
##                         geometry
## 1 MULTIPOLYGON (((-75.23536 3...
# Plot the geometry of pt_sf with the base plot function
plot(pt_geo)

# The base plot function has some aesthetic options we can use to tweak our plots
plot(pt_geo, col = "lemonchiffon2")   # col sets the polygon fill color

plot(pt_geo, lwd = 1, border = "red") # lwd/border control outline width and color

Line data

Next let’s look at an example of line data: streets in Philadelphia with bicycle access. This data was sourced directly from the Philadelphia Bike Network.

bn_sf <- st_read("data/Bike_Network/Bike_Network.shp")  # read shapefile as an sf object
## Reading layer `Bike_Network' from data source 
##   `/Users/tomasoles/Library/CloudStorage/Dropbox/Teaching/AppliedDataAnalysis/applied_data_analysis/3_week/data/Bike_Network/Bike_Network.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 5101 features and 8 fields
## Geometry type: LINESTRING
## Dimension:     XY
## Bounding box:  xmin: -75.26927 ymin: 39.87651 xmax: -74.96572 ymax: 40.124
## Geodetic CRS:  WGS 84
class(bn_sf)  # bn.sf is an sf object, which is a subclass of data.frame
## [1] "sf"         "data.frame"
# Once again, let's access the spatial attributes of this sf object with the st_geometry command. 
bn_geo <- st_geometry(bn_sf)
bn_geo[[2]]  # coordinates of line segment 2 (a LINESTRING sfg)
bn_sf[2,]    # the full record (attributes + geometry) for segment 2
## Simple feature collection with 1 feature and 8 fields
## Geometry type: LINESTRING
## Dimension:     XY
## Bounding box:  xmin: -75.25337 ymin: 39.88161 xmax: -75.25149 ymax: 39.88373
## Geodetic CRS:  WGS 84
##   OBJECTID SEG_ID   STREETNAME ST_CODE ONEWAY CLASS         TYPE Shape__Len
## 2        2 100003 BARTRAM  AVE   16120      B     2 Conventional    371.888
##                         geometry
## 2 LINESTRING (-75.25149 39.88...
# Let's plot the bike network data (geometry only, so just the line work)
plot(bn_geo)

Point data

As an example of point data, we will work with crime incidents that occurred in Philadelphia in September 2018. The full publicly available crime incidents database for Philadelphia is maintained by the Philadelphia Police Department and is available on the OpenDataPhilly website.

library(tidyverse)

crime_aux <- read_csv("data/crime_incidents.csv")

# Drop incidents with missing coordinates. A plain logical mask suffices:
# !is.na() always returns TRUE/FALSE, so wrapping it in which() was redundant.
crime_aux <- crime_aux[!is.na(crime_aux$lat) & !is.na(crime_aux$lng),]

# Convert to an sf POINT object from the lng/lat columns.
# NOTE(review): no crs is supplied, so the result prints "CRS: NA" below.
# The coordinates look like WGS 84 lon/lat, so crs = 4326 is presumably
# appropriate — confirm before doing any CRS-dependent operations.
crime <- st_as_sf(crime_aux, coords = c("lng","lat"))

# The crime data is an sf object of type POINT and includes information on the date, time and offense type for each incident

class(crime)
## [1] "sf"         "tbl_df"     "tbl"        "data.frame"
head(crime)  # note "CRS: NA" in the header below — no CRS was assigned at creation
## Simple feature collection with 6 features and 16 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -75.21435 ymin: 39.94692 xmax: -75.05671 ymax: 40.06165
## CRS:           NA
## # A tibble: 6 × 17
##   the_geom                cartodb_id the_geom_webmercator objectid dc_dist psa  
##   <chr>                        <dbl> <chr>                   <dbl>   <dbl> <chr>
## 1 0101000020E610000031D2…    3103180 0101000020110F00006… 14989348       9 2    
## 2 0101000020E6100000FDD7…    3120696 0101000020110F0000E… 13220352      35 3    
## 3 0101000020E6100000A869…    3123784 0101000020110F00007… 17337433       9 4    
## 4 0101000020E6100000AB40…    3123917 0101000020110F0000A… 16519896       2 3    
## 5 0101000020E6100000AB40…    3124050 0101000020110F0000A… 14864200       2 3    
## 6 0101000020E6100000B79E…    3127464 0101000020110F0000F… 15167770       5 1    
## # ℹ 11 more variables: dispatch_date_time <dttm>, dispatch_date <chr>,
## #   dispatch_time <time>, hour <dbl>, dc_key <dbl>, location_block <chr>,
## #   ucr_general <dbl>, text_general_code <chr>, point_x <dbl>, point_y <dbl>,
## #   geometry <POINT>
plot(st_geometry(crime))

# Let's take a look at offense types and use dplyr to filter by offense type...
table(crime$text_general_code)  # frequency count of each offense category
## 
##              Aggravated Assault Firearm           Aggravated Assault No Firearm 
##                                    1176                                    2305 
##                      All Other Offenses                                   Arson 
##                                    3309                                     173 
##                Burglary Non-Residential                    Burglary Residential 
##                                     659                                    1310 
##                      Disorderly Conduct             DRIVING UNDER THE INFLUENCE 
##                                      88                                     211 
##                            Embezzlement              Forgery and Counterfeiting 
##                                      58                                      41 
##                                   Fraud                     Gambling Violations 
##                                    1684                                       6 
##                     Homicide - Criminal                  Homicide - Justifiable 
##                                     178                                       1 
##                   Liquor Law Violations                     Motor Vehicle Theft 
##                                      16                                    2430 
##          Narcotic / Drug Law Violations    Offenses Against Family and Children 
##                                     855                                      94 
##                          Other Assaults Other Sex Offenses (Not Commercialized) 
##                                    5923                                     206 
##    Prostitution and Commercialized Vice                      Public Drunkenness 
##                                      63                                       3 
##                                    Rape               Receiving Stolen Property 
##                                     202                                     292 
##                         Robbery Firearm                      Robbery No Firearm 
##                                     728                                    1133 
##                      Theft from Vehicle                                  Thefts 
##                                    4239                                   17354 
##                      Vagrancy/Loitering             Vandalism/Criminal Mischief 
##                                       5                                    2936 
##                       Weapon Violations 
##                                     656
# Subset incidents by offense type (dplyr verbs work directly on sf objects)
homicide <- filter(crime, text_general_code == "Homicide - Criminal")
fraud <- filter(crime, text_general_code == "Fraud")

# Note subsets of an sf object are also sf objects
class(homicide)
## [1] "sf"         "tbl_df"     "tbl"        "data.frame"
class(fraud)
## [1] "sf"         "tbl_df"     "tbl"        "data.frame"
# Plotting homicide and fraud incidents (geometry only) with the base plot function
plot(st_geometry(homicide))

plot(st_geometry(fraud))

# Points by themselves are not very easy to understand. Let's layer them on top of the tract polygons with add = TRUE
plot(pt_geo)
# Base plot() has no `alpha` argument, so `alpha = 0.1` was not applied as
# transparency; encode it in the color itself with grDevices::adjustcolor().
plot(st_geometry(fraud), col = adjustcolor("blue", alpha.f = 0.1), add = TRUE)
plot(st_geometry(homicide), col = "red", add = TRUE)
legend("bottomright", legend = c("Fraud", "Homicide"), title = "Offense type:", col = c("blue", "red"), pch = 1, bty = "n")

Raster data

So far, we have considered point, line, and polygon data, all of which fall under the umbrella of vector data types. Rasters are a distinct GIS data type that we will consider only briefly because they cannot be handled with sf methods. We will look at the volcano dataset, which gives topographic information for Maunga Whau (a volcano located in Auckland, New Zealand) on a 10m by 10m grid. Because it is a relatively small raster, we can handle volcano using base functions. Larger rasters should be handled using the raster package.

library(datasets)

# The volcano dataset is a 87x61 matrix of elevation values on a 10m by 10m grid
class(volcano)
## [1] "matrix" "array"
str(volcano)
##  num [1:87, 1:61] 100 101 102 103 104 105 105 106 107 108 ...
filled.contour(volcano, color = terrain.colors, asp = 1)  # asp = 1 keeps grid cells square

NYS Lyme incidence data

We will use the example of New York State county-aggregated Lyme disease incidence for 2014-2016 to try our hand at spatial analysis. This data is publicly available and can be accessed at the Health Data NY website. Raw data can be downloaded in .csv format. If you’re curious to see how this tabular data can be merged with a New York State county shapefile (available at NYS GIS), you can see how this is done in the ‘prep_nys_lyme_data.R’ script file located in the ‘data’ folder of our project directory. But for now, we’ll start with this data that has already been merged and converted to an ‘sf’ object.

library(sf)

# Load NYS Lyme incidence data (county polygons pre-merged with 2014-2016 rates)
lyme <- readRDS("data/nys_lyme_data.rds")

# Let's take a look at some data attributes
class(lyme)
## [1] "sf"         "data.frame"
#head(lyme)

# Note that the variable Lyme.Incidence.Rate gives the county-level Lyme disease incidence per 100,000 population

# Once again, we can plot the spatial attributes of this data
plot(st_geometry(lyme))

This data is an example of regional rate data. An easy way to map this data is using the ‘tmap’ library. Let’s load ‘tmap’ and create an interactive map with just a few lines of code.

library(tmap)

tmap_mode("view")  # set mode to interactive ("plot" would give a static map)
tm_shape(lyme) +    # specify sf object with geographic attribute of interest
  tm_polygons("Lyme.Incidence.Rate")  # specify column with value of interest

We have a missing value in St. Lawrence county. Let’s remove this row from the data so it doesn’t throw an error later in our analysis.

# Drop the one county (St. Lawrence) whose incidence rate is missing
lyme <- lyme[!is.na(lyme$Lyme.Incidence.Rate),]

# Let's map again 
tmap_mode("view")  # set mode to interactive
tm_shape(lyme) +    # specify sf object with geographic attribute of interest
  tm_polygons("Lyme.Incidence.Rate")  # specify column with value of interest

Global clustering (Moran’s I)

This section was adapted from a tutorial created by Manuel Gimond, which can be found on his GitHub page.

Assess data distribution

Let’s begin by looking at the distribution of our Lyme incidence rate data. The Moran’s I statistic is not robust to extreme values or outliers so we will need to transform our data if it deviates greatly from a normal distribution.

# Five-number summary (mean far above median indicates strong right skew)
summary(lyme$Lyme.Incidence.Rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.90   13.50   36.30   80.27   98.50  599.60
# Histogram
hist(lyme$Lyme.Incidence.Rate)

# Boxplot
boxplot(lyme$Lyme.Incidence.Rate, horizontal = TRUE)

Our data is skewed strongly to the right with lots of outliers much greater than the mean. Let’s see if a log transformation can make our data look more normal.

# Create new variable that is the log-transformed incidence rate
# (natural log; the minimum rate is 1.90, so there are no zeros to worry about)
lyme$log_lyme_incidence <- log(lyme$Lyme.Incidence.Rate)

# Histogram
hist(lyme$log_lyme_incidence)

# Boxplot
boxplot(lyme$log_lyme_incidence, horizontal = TRUE)

That’s much better! We can see what our log-transformed values look like on a map.

tm_shape(lyme) +    # specify sf object with geographic attribute of interest
  tm_polygons("log_lyme_incidence")  # map the log-transformed rate instead

Define neighboring polygons

Now we’re ready to begin our analysis. The first step is to define “neighboring” polygons. Recall that neighbors can be defined based on contiguity or distance or as the k nearest neighbors to each polygon. We’ll use a queen-case contiguity-based definition, where any contiguous polygon that shares at least one vertex will be considered a neighbor. We can store the neighbors of each one of our polygons by creating an ‘nb’ object using the ‘poly2nb’ function from the ‘spdep’ library.

library(spdep)

# Create nb object from Lyme dataset (queen case: any shared boundary point
# makes two polygons neighbors). Use TRUE, not the reassignable shorthand T.
lyme_nb <- poly2nb(lyme, queen = TRUE) # queen case
class(lyme_nb)
## [1] "nb"
str(lyme_nb)
## List of 61
##  $ : int [1:6] 11 20 42 45 46 47
##  $ : int [1:4] 5 26 50 60
##  $ : int [1:4] 30 31 41 59
##  $ : int [1:4] 9 12 13 53
##  $ : int [1:4] 2 7 15 60
##  $ : int [1:7] 12 23 34 38 49 54 58
##  $ : int [1:2] 5 15
##  $ : int [1:4] 48 50 53 54
##  $ : int [1:5] 4 12 13 27 39
##  $ : int [1:2] 16 17
##  $ : int [1:5] 1 14 20 42 55
##  $ : int [1:7] 4 6 9 27 34 53 54
##  $ : int [1:7] 4 9 20 39 47 52 55
##  $ : int [1:4] 11 36 40 55
##  $ : int [1:5] 5 7 19 32 60
##  $ : int [1:5] 10 17 21 56 57
##  $ : int [1:3] 10 16 21
##  $ : int [1:4] 21 22 29 45
##  $ : int [1:6] 15 26 28 32 37 60
##  $ : int [1:6] 1 11 13 42 47 55
##  $ : int [1:6] 16 17 18 22 45 56
##  $ : int [1:6] 18 21 25 29 33 39
##  $ : int [1:3] 6 25 38
##  $ : int [1:3] 31 41 43
##  $ : int [1:4] 22 23 33 38
##  $ : int [1:6] 2 19 28 35 50 60
##  $ : int [1:6] 9 12 33 34 38 39
##  $ : int [1:5] 19 26 35 37 58
##  $ : int [1:6] 18 22 39 45 46 47
##  $ : int [1:4] 3 41 51 59
##  $ : int [1:3] 3 24 41
##  $ : int [1:3] 15 19 37
##  $ : int [1:5] 22 25 27 38 39
##  $ : int [1:4] 6 12 27 38
##  $ : int [1:6] 26 28 49 50 58 61
##  $ : int [1:5] 14 40 44 52 55
##  $ : int [1:3] 19 28 32
##  $ : int [1:6] 6 23 25 27 33 34
##  $ : int [1:7] 9 13 22 27 29 33 47
##  $ : int [1:4] 14 36 44 59
##  $ : int [1:5] 3 24 30 31 43
##  $ : int [1:5] 1 11 20 45 57
##  $ : int [1:2] 24 41
##  $ : int [1:3] 36 40 59
##  $ : int [1:8] 1 18 21 29 42 46 56 57
##  $ : int [1:4] 1 29 45 47
##  $ : int [1:6] 1 13 20 29 39 46
##  $ : int [1:5] 8 49 50 54 61
##  $ : int [1:6] 6 35 48 54 58 61
##  $ : int [1:6] 2 8 26 35 48 61
##  $ : int 30
##  $ : int [1:3] 13 36 55
##  $ : int [1:4] 4 8 12 54
##  $ : int [1:6] 6 8 12 48 49 53
##  $ : int [1:6] 11 13 14 20 36 52
##  $ : int [1:4] 16 21 45 57
##  $ : int [1:4] 16 42 45 56
##  $ : int [1:4] 6 28 35 49
##  $ : int [1:4] 3 30 40 44
##  $ : int [1:5] 2 5 15 19 26
##  $ : int [1:4] 35 48 49 50
##  - attr(*, "class")= chr "nb"
##  - attr(*, "region.id")= chr [1:61] "1" "2" "3" "4" ...
##  - attr(*, "call")= language poly2nb(pl = lyme, queen = T)
##  - attr(*, "type")= chr "queen"
##  - attr(*, "sym")= logi TRUE
# View the neighbors of the first polygon
lyme_nb[[1]]
## [1] 11 20 42 45 46 47
lyme$NAME[1]  # the first polygon is Albany county
## [1] "Albany"
lyme$NAME[c(11, 20, 42, 45, 46, 47)]  # the names of Albany's six neighbors
## [1] "Columbia"    "Greene"      "Rensselaer"  "Saratoga"    "Schenectady"
## [6] "Schoharie"

Assign weights to neighbors

Next, we’ll assign weights to each neighboring polygon. We’ll use the simplest option (‘style = “W”’), which assigns equal weight to each neighboring polygon. In other words, the weight applied to the neighbors of a polygon will equal 1/(no. of neighbors for that polygon).

# Calculate weights from nb object, we'll specify style = "W" for equal (row-standardized) weights
lyme_weights <- nb2listw(lyme_nb, style = "W")
class(lyme_weights)
## [1] "listw" "nb"
# View the weight of the first polygon's neighbors
str(lyme_weights, max.level = 1) # view the structure of lyme_weights, we'll set max.level = 1 for easier viewing
## List of 3
##  $ style     : chr "W"
##  $ neighbours:List of 61
##   ..- attr(*, "class")= chr "nb"
##   ..- attr(*, "region.id")= chr [1:61] "1" "2" "3" "4" ...
##   ..- attr(*, "call")= language poly2nb(pl = lyme, queen = T)
##   ..- attr(*, "type")= chr "queen"
##   ..- attr(*, "sym")= logi TRUE
##  $ weights   :List of 61
##   ..- attr(*, "mode")= chr "binary"
##   ..- attr(*, "W")= logi TRUE
##   ..- attr(*, "comp")=List of 1
##  - attr(*, "class")= chr [1:2] "listw" "nb"
##  - attr(*, "region.id")= chr [1:61] "1" "2" "3" "4" ...
##  - attr(*, "call")= language nb2listw(neighbours = lyme_nb, style = "W")
##  - attr(*, "zero.policy")= logi FALSE
lyme_weights$weights[[1]]  # The weights of the neighbors for the first polygon (Albany)
## [1] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
                 # Recall that Albany has 6 neighbors, so each weight is 1/6

lyme$NAME[2]     # Allegany
## [1] "Allegany"
lyme_nb[[2]]          # Allegany has 4 neighbors
## [1]  5 26 50 60
lyme_weights$weights[[2]]  # The weights of the neighbors for Allegany (each 1/4)
## [1] 0.25 0.25 0.25 0.25

Perform hypothesis testing

Now we can calculate the Moran’s I statistic and perform hypothesis testing using ‘moran.test’ (analytical calculation) and ‘moran.mc’ (via Monte Carlo simulations). These functions require that we specify the variable of interest and the list of neighbor weights for each polygon. The option ‘alternative = “greater”’ specifies testing for positive spatial autocorrelation, which is also the default for these functions. The ‘moran.mc’ function also requires that we specify the number of simulations with option ‘nsim’.

# Analytical test - quicker computation but sensitive to irregularly distributed polygons
moran.test(lyme$log_lyme_incidence, lyme_weights, alternative = "greater")
## 
##  Moran I test under randomisation
## 
## data:  lyme$log_lyme_incidence  
## weights: lyme_weights    
## 
## Moran I statistic standard deviate = 8.7379, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.720555645      -0.016666667       0.007118479
# Monte Carlo (MC) simulation is slower but the preferred method to calculate an accurate p-value
# (nsim = 999 means the smallest attainable p-value is 1/1000)
MC <- moran.mc(lyme$log_lyme_incidence, lyme_weights, nsim = 999, alternative = "greater")
MC
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  lyme$log_lyme_incidence 
## weights: lyme_weights  
## number of simulations + 1: 1000 
## 
## statistic = 0.72056, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater

Next, we’ll have R compute the average neighboring log Lyme incidence value for each polygon. These values are often referred to as spatially lagged values.

# Spatially lagged values: for each county, the weighted average of its
# neighbors' log incidence (with row-standardized "W" weights this is a mean)
log_lyme_incidence.lag <- lag.listw(lyme_weights, lyme$log_lyme_incidence)
log_lyme_incidence.lag
##  [1] 5.279184 2.322263 2.528599 4.374581 1.594856 3.496443 1.926349 4.496455
##  [9] 4.185537 4.021645 5.403306 3.925311 5.059918 5.668410 1.717511 3.812259
## [17] 3.896203 3.819425 1.544629 5.137699 3.989122 3.653277 3.248488 2.564676
## [25] 3.600780 2.600797 3.843967 2.163104 4.196165 2.510087 1.763183 1.827523
## [33] 3.858524 3.226352 2.892245 5.120626 1.877202 3.169335 3.897496 4.497368
## [41] 2.376910 5.287309 2.032887 4.765607 4.179317 4.232511 4.645199 4.015905
## [49] 3.583862 3.203023 2.079442 5.010065 4.314726 3.778138 5.388543 4.410235
## [57] 4.868271 2.727233 3.360805 1.890025 3.590111
# Moran scatterplot: lagged value vs. observed value for each county
plot(log_lyme_incidence.lag ~ lyme$log_lyme_incidence, pch=16, asp=1)
M1 <- lm(log_lyme_incidence.lag ~ lyme$log_lyme_incidence)
abline(M1, col="blue")

coef(M1)[2]  # the slope of this regression equals the global Moran's I statistic

We can see the results of the MC simulations graphically by passing the output of MC model to the ‘plot’ function.

plot(MC)

Local clustering (local Moran)

The local Moran statistic is an extension of the Moran’s I for the analysis of local (rather than global) spatial autocorrelation. There are some steps in common with the global clustering analysis we performed previously (for example, we have to calculate neighbor weights again) but there are key differences, due to particularities of the ‘rgeoda’ library.

Assign neighbor weights

We will once again find queen-case contiguous weights, though in ‘rgeoda’ we do this with the ‘queen_weights’ function. Note instead of a list of weights like we saw previously with the ‘nb2listw’ function, ‘queen_weights’ outputs an ‘rgeoda’ ‘Weight’ object, which has some nice features.

library(rgeoda)

# Find queen-case contiguous weights (rgeoda's analogue of poly2nb + nb2listw)
lyme_gweights <- queen_weights(lyme)
class(lyme_gweights)
## [1] "Weight"
## attr(,"package")
## [1] "rgeoda"
# str function allows us to see a nice summary of the weights object
str(lyme_gweights)
## Reference class 'Weight' [package "rgeoda"] with 9 fields
##  $ gda_w           :Formal class 'p_GeoDaWeight' [package "rgeoda"] with 1 slot
##   .. ..@ pointer:<externalptr> 
##  $ is_symmetric    : logi TRUE
##  $ sparsity        : num 0.0763
##  $ min_neighbors   : int 1
##  $ max_neighbors   : int 8
##  $ num_obs         : int 61
##  $ mean_neighbors  : num 4.66
##  $ median_neighbors: num 5
##  $ has_isolates    : logi FALSE
##  and 27 methods, of which 13 are  possibly relevant:
##    GetNeighbors, GetNeighborSize, GetNeighborWeights, GetPointer, GetSparsity,
##    HasIsolates, initialize, IsSymmetric, SaveToFile, SetNeighbors,
##    SetNeighborsAndWeights, SpatialLag, Update
# See the neighbors of the first polygon (Albany)
get_neighbors(lyme_gweights, 1)
## [1] 11 20 42 45 46 47
lyme$NAME[1]
## [1] "Albany"
lyme$NAME[get_neighbors(lyme_gweights, 1)]
## [1] "Columbia"    "Greene"      "Rensselaer"  "Saratoga"    "Schenectady"
## [6] "Schoharie"
# See the neighbors of the second polygon (Allegany)
get_neighbors(lyme_gweights, 2)
## [1]  5 26 50 60
lyme$NAME[2]
## [1] "Allegany"
lyme$NAME[get_neighbors(lyme_gweights, 2)]
## [1] "Cattaraugus" "Livingston"  "Steuben"     "Wyoming"
# See the neighbor weights of the first and second polygons (unstandardized: all 1)
get_neighbors_weights(lyme_gweights, 1)
## [1] 1 1 1 1 1 1
get_neighbors_weights(lyme_gweights, 2)
## [1] 1 1 1 1

Calculate Local Moran statistic

Now you can use your ‘rgeoda’ ‘Weight’ object to calculate the local Moran statistic at each polygon.

# We will coerce our data variable into a one-column data frame because this is the format required by the local_moran function
log_lyme_df <- as.data.frame(lyme$log_lyme_incidence)

# Now we can run the local_moran function
lyme_lisa <- local_moran(lyme_gweights, log_lyme_df)

# local_moran returns a LISA object
class(lyme_lisa)
## [1] "LISA"
## attr(,"package")
## [1] "rgeoda"
# Let's take a closer look at this LISA object
lyme_lisa$lisa_vals  # View local Moran's I values, one per polygon
##  [1]  6.303589e-01  1.106106e+00  1.414135e+00  4.771674e-01  1.205222e+00
##  [6]  6.955438e-02  2.297059e+00  3.348814e-01  3.241632e-01 -3.410545e-02
## [11]  2.786406e+00 -1.137256e-02  7.590118e-01  1.810965e+00  2.370781e+00
## [16]  1.539095e-01 -7.265369e-02 -3.917429e-02  1.341750e+00  2.403365e+00
## [21] -5.327627e-02  4.522845e-03 -2.021608e-02  6.452192e-01 -3.709835e-05
## [26]  1.369921e+00 -5.424938e-02  8.254362e-01 -1.846812e-03  9.200084e-01
## [31]  6.151358e-01  2.915899e+00 -6.473286e-02  1.193171e-01  2.607967e-01
## [36]  1.225206e+00  1.797554e+00 -4.994189e-02  2.284720e-01  1.108447e+00
## [41]  1.366207e+00  2.002465e+00  4.258473e-01  4.793295e-01  3.220550e-01
## [46]  1.858658e-01  5.146906e-01  2.360824e-01  4.121139e-03 -3.754591e-02
## [51] -1.523086e-01  9.750675e-01  5.005027e-01  1.171338e-01  1.861588e+00
## [56]  3.084596e-01  8.629906e-01  4.824627e-01  1.943183e-02  1.642308e+00
## [61] -2.581274e-03
lyme_lisa$p_vals     # View pseudo p-values
##  [1] 0.001 0.019 0.046 0.129 0.001 0.425 0.048 0.097 0.156 0.338 0.001 0.222
## [13] 0.002 0.001 0.001 0.309 0.389 0.399 0.001 0.001 0.220 0.438 0.335 0.102
## [25] 0.496 0.024 0.306 0.007 0.148 0.059 0.006 0.010 0.315 0.284 0.085 0.003
## [37] 0.010 0.217 0.261 0.085 0.014 0.001 0.047 0.065 0.099 0.169 0.022 0.242
## [49] 0.482 0.248 0.157 0.020 0.121 0.350 0.002 0.105 0.018 0.096 0.335 0.002
## [61] 0.493

Finally, we can make a map of our results! ‘rgeoda’ includes some nifty functions (‘lisa_colors’, ‘lisa_labels’ and ‘lisa_clusters’) to help us with our mapping.

map_colors <- lisa_colors(lyme_lisa)
map_labels <- lisa_labels(lyme_lisa)
map_clusters <- lisa_clusters(lyme_lisa)

# lisa_clusters() returns 0-based cluster codes, so shift by 1 to index the
# palette. vapply is type-stable (always returns character), unlike sapply.
plot(st_geometry(lyme), 
     col = vapply(map_clusters, function(x) map_colors[[x + 1]], character(1)), 
     border = "#333333", lwd=0.2)
legend('topright', legend = map_labels, fill = map_colors, border = "#eeeeee", cex = 0.7)

Mapping GDP Fall Due to Trade Sanctions on Russia

The ongoing conflict between Russia and Ukraine has caused significant economic disruptions, particularly through sanctions and changes in trade patterns. One way to analyze these disruptions is through Input-Output (I-O) analysis, a method that models the interdependencies between sectors in an economy. By examining output multipliers, we can understand how changes in one sector’s demand affect the entire economy.

Input-Output Analysis and Output Multipliers

The core formula used in I-O analysis to compute total output and output multipliers is derived from the Leontief inverse matrix. The formula is as follows:

\[ \mathbf{X} = (\mathbf{I} - \mathbf{A})^{-1} \mathbf{F} \]

Where: - \(\mathbf{X}\) is the output vector, representing total output for each sector. - \(\mathbf{I}\) is the identity matrix. - \(\mathbf{A}\) is the direct requirements matrix (or technical coefficient matrix), representing the inputs required from other sectors to produce one unit of output. - \((\mathbf{I} - \mathbf{A})^{-1}\) is the Leontief inverse matrix, capturing the total intersectoral dependencies. - \(\mathbf{F}\) is the final demand vector, representing demand for goods and services from external sources (e.g., households, exports).

Direct Requirements Matrix \(\mathbf{A}\)

The elements of the direct requirements matrix \(\mathbf{A}\) are calculated as:

\[ A_{ij} = \frac{Z_{ij}}{X_j} \]

Where: - \(Z_{ij}\) is the amount of output from sector \(i\) used as input by sector \(j\). - \(X_j\) is the total output of sector \(j\).

Leontief Inverse \((\mathbf{I} - \mathbf{A})^{-1}\)

The Leontief inverse matrix accounts for both direct and indirect effects of changes in final demand. It shows how changes in one sector affect the entire economy due to intersectoral linkages.

Output Multipliers

The output multipliers for sector \(j\) represent the total increase in output across all sectors from a unit increase in the final demand of sector \(j\). These multipliers are computed by summing the column elements of the Leontief inverse matrix:

\[ \text{Output Multiplier for Sector } j = \sum_{i} (\mathbf{I} - \mathbf{A})^{-1}_{ij} \]

This formula helps quantify how changes in demand for products or services in one sector lead to changes in the overall economic output, showing the ripple effect through the supply chain.

# Load necessary libraries
# NOTE(review): ggplot2 and dplyr are already attached by library(tidyverse),
# so the explicit calls below are redundant (harmless).
library(tidyverse)   # for data wrangling
library(tmaptools)   # for thematic mapping tools
library(tmap)        # for creating maps
library(ggplot2)     # for data visualization
library(leaflet)     # for interactive maps
library(sf)          # for handling spatial data
library(dplyr)       # for data manipulation
# NOTE(review): raster is attached after dplyr, so raster::select() masks
# dplyr::select() for the rest of the session -- later dplyr pipelines
# should call dplyr::select() explicitly to be safe.
library(raster)      # for working with raster data

Data Loading

Note that the output multipliers have already been computed for all economies.

# Read the precomputed output multipliers; blank lines are kept so row
# order matches the source file exactly.
GVC_EU <- read.csv(
  "data/outputs_Russia.csv",
  header = TRUE,
  sep = ",",
  blank.lines.skip = FALSE
)

# The first column identifies the country; give it the same name as the
# shapefile key so the two tables can be joined later.
colnames(GVC_EU)[1] <- "CNTR_ID"

Load Shapefile

# Load the shapefile for countries
# (Eurostat CNTR_RG_03M_2016 boundaries; per the st_read() output below it
# is a 257-feature MULTIPOLYGON layer in the ETRS89 / LAEA Europe CRS).
auxmap <- st_read("data/Countries_SHP1_3mil/CNTR_RG_03M_2016_3035.shp", 
                  stringsAsFactors = FALSE)
## Reading layer `CNTR_RG_03M_2016_3035' from data source 
##   `/Users/tomasoles/Library/CloudStorage/Dropbox/Teaching/AppliedDataAnalysis/applied_data_analysis/3_week/data/Countries_SHP1_3mil/CNTR_RG_03M_2016_3035.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 257 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -7143416 ymin: -9160773 xmax: 16936900 ymax: 15428320
## Projected CRS: ETRS89-extended / LAEA Europe
# Display the structure of the shapefile to understand its content
# (key columns: CNTR_ID, NAME_ENGL, ISO3_CODE, plus the geometry column).
str(auxmap)
## Classes 'sf' and 'data.frame':   257 obs. of  6 variables:
##  $ CNTR_ID  : chr  "AD" "AE" "AF" "AG" ...
##  $ CNTR_NAME: chr  "Andorra" "الإمارات العربية المتحدة" "افغانستان-افغانستان" "Antigua and Barbuda" ...
##  $ NAME_ENGL: chr  "Andorra" "United Arab Emirates" "Afghanistan" "Antigua and Barbuda" ...
##  $ ISO3_CODE: chr  "AND" "ARE" "AFG" "ATG" ...
##  $ FID      : chr  "AD" "AE" "AF" "AG" ...
##  $ geometry :sfc_MULTIPOLYGON of length 257; first list element: List of 1
##   ..$ :List of 1
##   .. ..$ : num [1:35, 1:2] 3640254 3638125 3635523 3634181 3629280 ...
##   ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
##  - attr(*, "sf_column")= chr "geometry"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA
##   ..- attr(*, "names")= chr [1:5] "CNTR_ID" "CNTR_NAME" "NAME_ENGL" "ISO3_CODE" ...

Merge Data on Economic Impact with Shapefile

# Attach the economic-impact data to the country geometries, keeping only
# countries present in both tables.  The join key is spelled out explicitly
# rather than relying on a natural join over all shared column names
# (which is fragile and prints a "Joining, by = ..." message).
aux_map_data <- inner_join(auxmap, GVC_EU, by = "CNTR_ID")

Visualizing GDP Fall Due to Sanctions

Create a Quantile Map

# Quantile choropleth of the GDP fall: eight equal-count classes, with a
# histogram of the classified values shown beside the legend.
auxQmap1 <- tm_shape(aux_map_data) +
  tm_fill(
    col = "gGDPc",
    style = "quantile",
    n = 8,
    legend.hist = TRUE,
    palette = "YlOrRd",
    title = "GDP Fall Due to Trade Sanctions on Russia",
    alpha = 0.6
  ) +
  tm_layout(legend.outside = TRUE)

# Switch tmap to interactive ("view") mode
tmap_mode("view")

# Render, centred on Europe (lon 7, lat 51, zoom level 4)
auxQmap1 + tm_view(set.view = c(7, 51, 4), text.size.variable = TRUE)
# Return to current mode
#tmap_mode(current.mode)

Hierarchical Clustering Style

# Alternative classification: hierarchical clustering ("hclust") derives
# the class breaks from the data instead of using equal counts.
auxQmap2 <- tm_shape(aux_map_data) +
  tm_fill(
    col = "gGDPc",
    style = "hclust",
    n = 8,
    legend.hist = TRUE,
    palette = "YlOrRd",
    title = "GDP Fall Due to Trade Sanctions on Russia",
    alpha = 0.6
  ) +
  tm_layout(legend.outside = TRUE)

# Interactive ("view") mode
tmap_mode("view")

# Render, centred on Europe
auxQmap2 + tm_view(set.view = c(7, 51, 4), text.size.variable = TRUE)

Mapping Trade in Intermediate and Final Products

Intermediate Products

# GDP impact channelled through trade in intermediate products (gGDPc_A),
# classified by hierarchical clustering.
auxQmap3 <- tm_shape(aux_map_data) +
  tm_fill(
    col = "gGDPc_A",
    style = "hclust",
    n = 8,
    legend.hist = TRUE,
    palette = "YlOrRd",
    title = "GDP Impact Due to Trade Sanctions in Intermediate Products",
    alpha = 0.6
  ) +
  tm_layout(legend.outside = TRUE)

# Render interactively, centred on Europe
tmap_mode("view")
auxQmap3 + tm_view(set.view = c(7, 51, 4), text.size.variable = TRUE)

Intermediate Products (note: this section repeats the previous chunk verbatim)

# NOTE(review): this chunk is an exact duplicate of the previous
# "Intermediate Products" chunk (same column, same style, same object
# name auxQmap3) -- likely an accidental copy-paste; consider removing
# one of the two.
# Create a map for trade in intermediate products and its impact on GDP
auxQmap3 <- tm_shape(aux_map_data) +
  tm_fill(col = "gGDPc_A",   
          style = "hclust", 
          n = 8, 
          legend.hist = TRUE, 
          palette = "YlOrRd", 
          title = "GDP Impact Due to Trade Sanctions in Intermediate Products", 
          alpha = 0.6) +
  tm_layout(legend.outside = TRUE)

# Display the map in interactive view
tmap_mode("view")
auxQmap3 + tm_view(set.view = c(7, 51, 4), text.size.variable = TRUE)

Final Products

# GDP impact channelled through trade in final products (gGDPc_Y),
# classified by hierarchical clustering.
auxQmap4 <- tm_shape(aux_map_data) +
  tm_fill(
    col = "gGDPc_Y",
    style = "hclust",
    n = 8,
    legend.hist = TRUE,
    palette = "YlOrRd",
    title = "GDP Impact Due to Trade Sanctions in Final Products",
    alpha = 0.6
  ) +
  tm_layout(legend.outside = TRUE)

# Render interactively, centred on Europe
tmap_mode("view")
auxQmap4 + tm_view(set.view = c(7, 51, 4), text.size.variable = TRUE)

Maps in ggplot

library(maps)
library(countrycode)
library(gapminder)

# Country outlines for the whole world: a data frame of polygon vertices,
# one row per point, grouped into polygons by `group`.
dat_map <- map_data("world")

# Draw the empty basemap: white countries with black borders.
ggplot(dat_map, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "white", colour = "black")

# First attempt: join GDP data onto the map by matching map region names
# directly against gapminder country names, then colour by log10 GDP
# per capita.
gdp_map <- full_join(x = dat_map, y = gapminder, by = c("region" = "country"))

ggplot(gdp_map, aes(x = long, y = lat, group = group, fill = log10(gdpPercap))) +
  geom_polygon() +
  scale_fill_gradient(low = "red", high = "green")

# Too many countries fail the name-based match, so convert both tables to
# World Bank country codes and join on those instead.
gapminder$ccode <- countrycode(
  gapminder$country,
  origin = "country.name",
  destination = "wb"
)
dat_map$ccode <- countrycode(
  dat_map$region,
  origin = "country.name",
  destination = "wb"
)
gdp_map_codes <- full_join(x = dat_map, y = gapminder, by = "ccode")


# Redraw the choropleth using the code-based join; far fewer countries
# are missing now.
ggplot(
  gdp_map_codes,
  aes(x = long, y = lat, group = group, fill = log10(gdpPercap))
) +
  geom_polygon() +
  scale_fill_gradient(low = "red", high = "green")

Dynamic graphs (examples)

library(tidyverse)
library(gapminder)
library(echarts4r)
library(gganimate)
library(ggiraph)
library(widgetframe)
library(ggthemes)
library(plotly)
library(viridis)
library(DT)

# Country codes shipped with the gapminder package
gapminder_codes <- gapminder::country_codes

# Full (unfiltered) gapminder panel
gapminder <- gapminder::gapminder_unfiltered

# Join the two so each row carries country, continent and an ISO alpha
# code (copied into `code` for later joins).
gapminder <- gapminder %>%
  inner_join(gapminder_codes, by = "country") %>%
  mutate(code = iso_alpha)

# World polygons with Antarctica removed
world <- map_data("world") %>%
  filter(region != "Antarctica")


# Attach the maps-package polygon name to each gapminder row, joining the
# ISO3 code (`code`) to maps::iso3166$a3, then strip any parenthesised
# suffix from the map name.
# select() is namespaced explicitly because raster::select() masks
# dplyr::select() once the raster package is attached earlier in this
# document.
gapminder_data <- gapminder %>%
  inner_join(maps::iso3166 %>%
               dplyr::select(a3, mapname), by = c(code = "a3")) %>%
  mutate(mapname = str_remove(mapname, "\\(.*"))


# Harmonise gapminder country names to the spellings used by the
# echarts4r world map, so e_map() can match regions.  recode_factor()
# leaves every level not listed here unchanged.
datagpmnd <- gapminder %>%
  mutate(Name = recode_factor(country,
                              `Congo, Dem. Rep.`= "Dem. Rep. Congo",
                              `Congo, Rep.`= "Congo",
                              `Cote d'Ivoire`= "Côte d'Ivoire",
                              `Central African Republic`= "Central African Rep.",
                              `Yemen, Rep.`= "Yemen",
                              `Korea, Rep.`= "Korea",
                              `Korea, Dem. Rep.`= "Dem. Rep. Korea",
                              `Czech Republic`= "Czech Rep.",
                              `Slovak Republic`= "Slovakia",
                              `Dominican Republic`= "Dominican Rep.",
                              `Equatorial Guinea`= "Eq. Guinea"))


# Animated world map of life expectancy: grouping by year gives one
# timeline frame per year; the visual map uses a piecewise colour scale
# over 30-90 years, and the tooltip shows the value for each country.
datagpmnd %>%
  group_by(year) %>%
  e_chart(Name, timeline = TRUE) %>%
  e_map(lifeExp) %>%
  e_visual_map(min= 30, max= 90,
               type = 'piecewise') %>%
  e_title("Life expectancy by country and year", left = "center") %>%
  e_tooltip(
    trigger = "item",
    formatter = e_tooltip_choro_formatter())